{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# Load the raw CSV (decoded as GBK) and discard exact duplicate rows.\n",
    "import warnings\n",
    "\n",
    "import pandas as pd\n",
    "\n",
    "# Silences every warning for the rest of the session.\n",
    "warnings.filterwarnings('ignore')\n",
    "data = pd.read_csv('./data/train_tag.csv', encoding='gbk').drop_duplicates()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['id']\n"
     ]
    }
   ],
   "source": [
    "# Drop uninformative features: columns holding a single distinct value, and\n",
    "# columns whose distinct-value count equals (or is one below) their non-null\n",
    "# count, i.e. identifier-like columns.\n",
    "non_null = data.count()\n",
    "drop_list = [\n",
    "    col for col in data.columns\n",
    "    # unique() keeps NaN as a value, so missing entries count as one category here\n",
    "    if len(data[col].unique()) in (1, non_null[col], non_null[col] - 1)\n",
    "]\n",
    "print(drop_list)\n",
    "data.drop(columns=drop_list, inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<class 'pandas.core.frame.DataFrame'>\n",
      "Int64Index: 39923 entries, 0 to 39922\n",
      "Data columns (total 42 columns):\n",
      " #   Column                           Non-Null Count  Dtype  \n",
      "---  ------                           --------------  -----  \n",
      " 0   flag                             39923 non-null  int64  \n",
      " 1   gdr_cd                           39187 non-null  object \n",
      " 2   age                              39923 non-null  int64  \n",
      " 3   mrg_situ_cd                      39187 non-null  object \n",
      " 4   edu_deg_cd                       26751 non-null  object \n",
      " 5   acdm_deg_cd                      39186 non-null  object \n",
      " 6   deg_cd                           18224 non-null  object \n",
      " 7   job_year                         39430 non-null  float64\n",
      " 8   ic_ind                           39430 non-null  float64\n",
      " 9   fr_or_sh_ind                     39430 non-null  float64\n",
      " 10  dnl_mbl_bnk_ind                  39430 non-null  float64\n",
      " 11  dnl_bind_cmb_lif_ind             39430 non-null  float64\n",
      " 12  hav_car_grp_ind                  39423 non-null  float64\n",
      " 13  hav_hou_grp_ind                  39430 non-null  float64\n",
      " 14  l6mon_agn_ind                    39423 non-null  float64\n",
      " 15  frs_agn_dt_cnt                   39423 non-null  float64\n",
      " 16  vld_rsk_ases_ind                 39423 non-null  float64\n",
      " 17  fin_rsk_ases_grd_cd              39430 non-null  float64\n",
      " 18  confirm_rsk_ases_lvl_typ_cd      39430 non-null  float64\n",
      " 19  cust_inv_rsk_endu_lvl_cd         39423 non-null  float64\n",
      " 20  l6mon_daim_aum_cd                39923 non-null  int64  \n",
      " 21  tot_ast_lvl_cd                   39423 non-null  float64\n",
      " 22  pot_ast_lvl_cd                   39423 non-null  float64\n",
      " 23  bk1_cur_year_mon_avg_agn_amt_cd  39923 non-null  int64  \n",
      " 24  l12mon_buy_fin_mng_whl_tms       39423 non-null  float64\n",
      " 25  l12_mon_fnd_buy_whl_tms          39423 non-null  float64\n",
      " 26  l12_mon_insu_buy_whl_tms         39423 non-null  float64\n",
      " 27  l12_mon_gld_buy_whl_tms          39423 non-null  float64\n",
      " 28  loan_act_ind                     39430 non-null  float64\n",
      " 29  pl_crd_lmt_cd                    39923 non-null  int64  \n",
      " 30  ovd_30d_loan_tot_cnt             39430 non-null  float64\n",
      " 31  his_lng_ovd_day                  39430 non-null  float64\n",
      " 32  hld_crd_card_grd_cd              39430 non-null  float64\n",
      " 33  crd_card_act_ind                 39423 non-null  float64\n",
      " 34  l1y_crd_card_csm_amt_dlm_cd      39423 non-null  float64\n",
      " 35  atdd_type                        16259 non-null  float64\n",
      " 36  perm_crd_lmt_cd                  39923 non-null  int64  \n",
      " 37  cur_debit_cnt                    39923 non-null  int64  \n",
      " 38  cur_credit_cnt                   39923 non-null  int64  \n",
      " 39  cur_debit_min_opn_dt_cnt         39923 non-null  int64  \n",
      " 40  cur_credit_min_opn_dt_cnt        39923 non-null  int64  \n",
      " 41  cur_debit_crd_lvl                39923 non-null  int64  \n",
      "dtypes: float64(26), int64(11), object(5)\n",
      "memory usage: 13.1+ MB\n"
     ]
    }
   ],
   "source": [
    "# Overview of the dataset: per-column non-null counts and dtypes\n",
    "data.info()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "atdd_type                      0.592741\n",
      "l12_mon_gld_buy_whl_tms        0.012524\n",
      "l12_mon_insu_buy_whl_tms       0.012524\n",
      "frs_agn_dt_cnt                 0.012524\n",
      "vld_rsk_ases_ind               0.012524\n",
      "cust_inv_rsk_endu_lvl_cd       0.012524\n",
      "hav_car_grp_ind                0.012524\n",
      "tot_ast_lvl_cd                 0.012524\n",
      "pot_ast_lvl_cd                 0.012524\n",
      "l12mon_buy_fin_mng_whl_tms     0.012524\n",
      "l12_mon_fnd_buy_whl_tms        0.012524\n",
      "l6mon_agn_ind                  0.012524\n",
      "crd_card_act_ind               0.012524\n",
      "l1y_crd_card_csm_amt_dlm_cd    0.012524\n",
      "dnl_mbl_bnk_ind                0.012349\n",
      "loan_act_ind                   0.012349\n",
      "hav_hou_grp_ind                0.012349\n",
      "ovd_30d_loan_tot_cnt           0.012349\n",
      "his_lng_ovd_day                0.012349\n",
      "hld_crd_card_grd_cd            0.012349\n",
      "dtype: float64\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "<matplotlib.axes._subplots.AxesSubplot at 0x15ce5550dc8>"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZ0AAAD5CAYAAAD8zehaAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3dfZxdVX3v8c9vzsxJzqRAxEQtSSBRYhGqQRlCfUABLwoqDVgqYIvFSrmoSOkt90pvn+z16vXx9mpB08grBVvaoFY0hQBXQQRFIBMMCQlEYkAyBmR4SiSZzOOvf6x1kp2TMzNnJmdm9tp8369XXjlnn3X2/u2919q/vdbec7a5OyIiIpOhZaoDEBGRFw8lHRERmTRKOiIiMmmUdEREZNIo6YiIyKRpnaoFz5o1y+fPnz9VixcRSdKaNWuedvfZUx3HeE1Z0pk/fz6dnZ1TtXgRkSSZ2S+mOoYDoeE1ERGZNEo6IiIyaZR0RERk0jSUdMzsNDPbZGabzeyKYcqcZGZrzWyDmf2wuWGKiEgRjHojgZmVgKuAU4EuYLWZrXT3jZkyM4GvAKe5++Nm9rKJClhERNLVSE9nMbDZ3be4ex+wAlhSU+b9wLfd/XEAd3+quWGKiEgRNJJ05gBbM++74rSsVwMvMbM7zGyNmX2g3ozM7CIz6zSzzu7u7vFFLCIiyWok6VidabXPQ2gFjgPeDbwT+Gsze/V+X3Jf5u4d7t4xe/b4/rbpsad3cu3dj7G9p39c3xcRkanTSNLpAuZl3s8FttUpc4u773T3p4E7gUXNCXFfG5/Ywd+u3MCT23dPxOxFRGQCNZJ0VgMLzWyBmZWBc4GVNWW+C5xoZq1m1g6cADzU3FCDSrkEwM6+gYmYvYiITKBR715z9wEzuwS4FSgBy919g5ldHD9f6u4PmdktwDpgCLja3R+ciIDb20LS6ekbnIjZi4jIBGrot9fcfRWwqmba0pr3nwc+37zQ6msvh5B3KemIiCQnuV8kaJ8Wejq7NLwmIpKc9JJOWcNrIiKpSi/ptIXhtZ1KOiIiyUku6VT29HQ0vCYikprkkk65tYXWFtONBCIiCUou6UC4rqOkIyKSnkSTTqtuJBARSVCiSaekXyQQEUlQkkmnUi6ppyMikqAkk46u6YiIpCnRpNPKrn4lHRGR1CSadEr6Ox0RkQQlmXQqGl4TEUlSkklH13RERNKUaNJp1a9Mi4gkKMmkU2krsbt/iKEhn+pQRERkDJJMOjPiM3V6dAebiEhSkkw6FT09VEQkSUkmnfY2PT1URCRFaSadcjXpqKcjIpKSJJNORUlHRCRJSSadGdPCNR396KeISFqSTDoVXdMREUlSkklH13RERNLUUNIxs9PMbJOZbTazK+p8fpKZbTeztfHf3zQ/1L3adcu0iEiSWkcrYGYl4CrgVKALWG1mK919Y03Ru9z9PRMQ43723kig4TURkZQ00tNZDGx29y3u3gesAJZMbFgjqw6v6UYCEZG0NJJ05gBbM++74rRabzSzB8zsZjM7pt6MzOwiM+s0s87u7u5xhBu0lVool1r0IDcRkcQ0knSszrTaX9q8HzjC3RcB/wB8p96M3H2Zu3e4e8fs2bPHFmmNSrmkno6ISGIaSTpdwLzM+7nAtmwBd9/h7i/E16uANjOb1bQo62gvl9jZq2s6IiIpaSTprAYWmtkCMysD5wIrswXM7BVmZvH14jjfZ5odbFalXNLwmohIYka9e83dB8zsEuBWoAQsd/cNZnZx/HwpcDbwYTMbAHqAc919Qh92067hNRGR5IyadGDPkNmqmmlLM6+vBK5sbmgj09NDRUTSk+QvEoB6OiIiKUo66exU0hERSUqySafS1qqejohIYpJNOu3lkq7piIgkJt2kM62kH/wUEUlMukmnrZXegSEGhyb0zmwREWmidJOOfmlaRCQ5ySadin5pWkQkOckmHT09VEQk
PQknHT09VEQkNQknnTi81q9rOiIiqUg+6ainIyKSjmSTTvVGgp29SjoiIqlINulUr+loeE1EJB0JJx0Nr4mIpCb5pKO/0xERSUfCSUe3TIuIpCbZpFNqMcqtLezUz+CIiCQj2aQDenqoiEhq0k46bXq8gYhIStJOOtP09FARkZSknXT09FARkaQknXQqbSV2qqcjIpKMpJOObiQQEUlLQ0nHzE4zs01mttnMrhih3PFmNmhmZzcvxOG1l1s1vCYikpBRk46ZlYCrgNOBo4HzzOzoYcp9Fri12UEORz0dEZG0NNLTWQxsdvct7t4HrACW1Cn3MeDfgaeaGN+I2ssldvUr6YiIpKKRpDMH2Jp53xWn7WFmc4CzgKXNC210lXKr/k5HRCQhjSQdqzPNa97/P+Dj7j5iBjCzi8ys08w6u7u7G41xWO3lEn0DQwwMDh3wvEREZOK1NlCmC5iXeT8X2FZTpgNYYWYAs4B3mdmAu38nW8jdlwHLADo6OmoT15jtebxB/yAHl5K+EU9E5EWhkaSzGlhoZguAXwLnAu/PFnD3BdXXZnYNcGNtwpkIlczjDQ6e3jbRixMRkQM0atJx9wEzu4RwV1oJWO7uG8zs4vj5pF7HyZqhxxuIiCSlkZ4O7r4KWFUzrW6ycfcLDjysxlT2PD1Uf6sjIpKCpC+E6JHVIiJpUdIREZFJk3TSqbSF0cEeDa+JiCQh6aQzY5p6OiIiKUk66VQ0vCYikpSkk077nlumNbwmIpKCpJNOpU09HRGRlCSddEotxrTWFj3eQEQkEUknHYiPN1DSERFJQgGSjh5vICKSigIknRI9/bqRQEQkBYVIOjt71dMREUlB8kmnUi7pRgIRkUQkn3Tay63s0vCaiEgSCpB0dPeaiEgqCpF0NLwmIpKGAiSdVnb2anhNRCQFySedSrlET796OiIiKUg+6bS3legfdPoHh6Y6FBERGUX6SWda9Zem1dsREcm79JNOfKaObiYQEcm/wiSdnXqmjohI7iWfdKrP1FFPR0Qk/5JPOnufHqqkIyKSd8knnUq5+vRQDa+JiORdQ0nHzE4zs01mttnMrqjz+RIzW2dma82s08ze0vxQ65sxTcNrIiKpaB2tgJmVgKuAU4EuYLWZrXT3jZlitwEr3d3N7HXAN4CjJiLgWu1tGl4TEUlFIz2dxcBmd9/i7n3ACmBJtoC7v+DuHt/OAJxJouE1EZF0NJJ05gBbM++74rR9mNlZZvYwcBPwx/VmZGYXxeG3zu7u7vHEu5/2PUlHPR0RkbxrJOlYnWn79WTc/QZ3Pwo4E/hkvRm5+zJ373D3jtmzZ48t0mFUb5lW0hERyb9Gkk4XMC/zfi6wbbjC7n4n8Cozm3WAsTWkpcWotOlHP0VEUtBI0lkNLDSzBWZWBs4FVmYLmNmRZmbx9RuAMvBMs4MdTniQm67piIjk3ah3r7n7gJldAtwKlIDl7r7BzC6Ony8Ffg/4gJn1Az3AOZkbCyZcpVxiV696OiIieTdq0gFw91XAqpppSzOvPwt8trmhNU6PrBYRSUPyv0gAUCm3skvXdEREcq8QSWdGuUSPrumIiOReIZKOhtdERNJQiKRTKbfqt9dERBJQiKTT3lbSQ9xERBJQiKRT0fCaiEgSCpF02sslDa+JiCSgEElnxrRWBoacvoGhqQ5FRERGUIikU/3RT/V2RETyrRBJp/p4A91MICKSb4VIOhU9U0dEJAmFSDrt5fATchpeExHJt0IknRl6ZLWISBIKkXT2DK/pRz9FRHKtEEmnOrymZ+qIiORbQZKOhtdERFJQiKRTHV7r0fCaiEiuFSLpzKgOr+nuNRGRXCtE0pne1oKZko6ISN4VIumYGZU2PT1URCTvCpF0INxMsFM9HRGRXCtM0qno8QYiIrlXmKTT3taqW6ZFRHKuOElnmp4eKiKSdw0lHTM7zcw2mdlmM7uizud/YGbr4r+7zWxR80MdmZ4eKiKSf6MmHTMrAVcBpwNHA+eZ
2dE1xR4F3uburwM+CSxrdqCjqbS16kYCEZGca6SnsxjY7O5b3L0PWAEsyRZw97vd/bn49h5gbnPDHF3o6eiajohInjWSdOYAWzPvu+K04XwIuLneB2Z2kZl1mllnd3d341E2oL2sazoiInnXSNKxOtO8bkGzkwlJ5+P1Pnf3Ze7e4e4ds2fPbjzKBrSXW3VNR0Qk51obKNMFzMu8nwtsqy1kZq8DrgZOd/dnmhNe49rLJXb1D+LumNXLkyIiMtUa6emsBhaa2QIzKwPnAiuzBczscODbwPnu/rPmhzm6SrnE4JDTOzA0FYsXEZEGjNrTcfcBM7sEuBUoAcvdfYOZXRw/Xwr8DfBS4CuxlzHg7h0TF/b+qs/U6ekbZHpbaTIXLSIiDWpkeA13XwWsqpm2NPP6QuDC5oY2Nu2ZR1a/ZCoDERGRYRXnFwniM3V027SISH4VKOlUH1mtO9hERPKqMEmnoqQjIpJ7hUk67XseWa3hNRGRvCpQ0lFPR0Qk7wqTdCptSjoiInlXmKQzY1r17jUlHRGRvCpM0tHwmohI/hUm6UxrbcFMNxKIiORZYZKOmdHepscbiIjkWWGSDkCl3KqkIyKSY4VKOjOm6emhIiJ5VqikU9HwmohIrhUq6eiR1SIi+VawpNOqu9dERHKsUEmnop6OiEiuFSrpzCiX6OlX0hERyatCJR3dMi0ikm+FSjrt5ZJ+e01EJMcKl3R29g3g7lMdioiI1FGopFMpl3CH3oGhqQ5FRETqKFTSadczdUREcq1YSWeaHlktIpJnxUo68Zk6uplARCSfGko6ZnaamW0ys81mdkWdz48ys5+YWa+ZXd78MBtTTTo7lXRERHKpdbQCZlYCrgJOBbqA1Wa20t03Zoo9C1wKnDkhUTao0qbhNRGRPGukp7MY2OzuW9y9D1gBLMkWcPen3H010D8BMTZMw2siIvnWSNKZA2zNvO+K03JnxjTdvSYikmeNJB2rM21cf31pZheZWaeZdXZ3d49nFiOqlMPwmno6IiL51EjS6QLmZd7PBbaNZ2HuvszdO9y9Y/bs2eOZxYiqf6ezU9d0RERyqZGksxpYaGYLzKwMnAusnNiwxqdS1vCaiEiejXr3mrsPmNklwK1ACVju7hvM7OL4+VIzewXQCRwMDJnZZcDR7r5jAmPfz7TWFlpMw2siInk1atIBcPdVwKqaaUszr58kDLtNKTNjhh5vICKSW4X6RQIIQ2w9/bqmIyKSR4VLOu16ZLWISG4VLulUyq3s7FXSERHJo8IlnXYNr4mI5FYhk46G10RE8qmQSUe3TIuI5FMBk45umRYRyavCJZ1KuaRHG4iI5FThkk57m67piIjkVfGSTrlET/8g7uP6IWwREZlAxUs601pxh939Q1MdioiI1Che0tnzS9O6riMikjeFSzqVNj3eQEQkrwqXdNrj00OVdERE8qeASUfDayIieVW4pFN9eqh+lUBEJH8Kl3RmaHhNRCS3Cpd0qj2dXf1KOiIieVO4pLPnmk6vrumIiORNcZOOhtdERHKncElnz40EGl4TEcmdwiWdcqmF1hbTLdMiIjlUuKRjZvHxBurpiIjkTetUBzARZpRbue7ex/nu2m1THYrUMMKJQYtBS/zfzLD43myqI3xxMeDNR87i4re9inmHtk91OPIiUMikc8XpR7HmF89NdRhSw3HcwQF3Z2gIhtwZ8vDZ0JAeRzHZevoH+WZnF9ev3spZr5/DR04+kgWzZkx1WFJg1shzZ8zsNOBLQAm42t0/U/O5xc/fBewCLnD3+0eaZ0dHh3d2do43bhFpkie37+Yf7/w5/3rv4/QPDnHGosO45OQjWfjyg6Y6NKnDzNa4e8dUxzFeoyYdMysBPwNOBbqA1cB57r4xU+ZdwMcISecE4EvufsJI81XSEcmX7l/3cvWPtvDPP/kFPf2DnHbMK7jklCM55rBDpjo0yUg96TQyvLYY2OzuWwDMbAWwBNiYKbME+LqHDHaPmc00s990
9yeaHrGITIjZB03jL05/DRe/9VUs//GjXPPjx7j5wSc5dt7MPX//Js1xxqLDOG/x4VMdxpRoJOnMAbZm3ncRejOjlZkD7JN0zOwi4CKAww9/cW5wkbx7yYwyf/6O3+LCE1/JtXc/xo8eeZr+QT2Jt5kGX8TXLxtJOvXuJ6rdYo2Uwd2XAcsgDK81sGwRmSKHVNq49O0LufTtC6c6FCmQRv5OpwuYl3k/F6i9F7mRMiIi8iLXSNJZDSw0swVmVgbOBVbWlFkJfMCC3wG263qOiIjUGnV4zd0HzOwS4FbCLdPL3X2DmV0cP18KrCLcubaZcMv0BycuZBERSVVDfxzq7qsIiSU7bWnmtQMfbW5oIiJSNIX77TUREckvJR0REZk0SjoiIjJplHRERGTSNPSDnxOyYLNu4Bfj/Pos4OkmhjORUolVcTZfKrEqzuaa6DiPcPfZEzj/CTVlSedAmFlnKj94l0qsirP5UolVcTZXKnFOFQ2viYjIpFHSERGRSZNq0lk21QGMQSqxKs7mSyVWxdlcqcQ5JZK8piMiImlKtacjIiIJUtIREZFJo6QjIiKTZlKTjpn9zxE++4SZXV5n+nwze3C475vZsWb2rsz7u8cY0wuZ17eY2fNmdmNNmevMbJOZPWhmy82srebza8zs7LEsd7zM7A4z64ivp5nZ981srZmdM8r39mxfM3vMzGYNM+8zqtt7lPm9MMz0uvNuVE2c+9SJ2mWa2UnVfRXryYVm9pF688pMO8zMbjSzb40hpgvidr5yvOs1yvzr1v1hys43s13xdYeZfTm+rtaFzWa2ZpjvXm1mR48xtrr7uV6ZuG2/FV/v0y4PRG3cjWyv4dpkzTY7ycze1KQYHzCzZfH1mWPdzg3M/zIzaz/AeWTb1gVmdlhzotsz/2GP71mT3dNpKKgxfv9YwrN8AHD3A6lEnwfOrzP9OuAoYBFQAS48gGU00+uBNnc/1t2vn+pgpth84DzgIyMVcvdt7v4ed5+Uk4QJsgXA3Tvd/dI47fVAG6FuPgFgZqXsl9z9Qnff2MgCzKyhx57UzH+bu58dl7tPu2xwmaV608cSdwMxZrfZSUBTkg7w90BffH0m0NSkA1wGHFDSqXEB0NSkQ6PHd3efkH/Ad4A1wAbgIuAzwBDQAzwH/DNwP/BL4PvAvwG98bvvBF4AdgLdhEb2GWAQWAtcF8uVgcdjmbXAOXEZswkV6o44n0eAnwNLgQeBXwPvifPoITx8bjWwDvgicGNmPV4A/hdwL/CWGGt3LPuFWOYa4Oz4+odxfv8feAx4L/A5YD1wC3BCLPMzYAfwELA8Tvss0As8A2yK31kErIjLuz7G0QG8LK7r9rjuH42x/l3cruuBo2JML43r/8s4j15gVvxsPrAuvr4DOAN4GLg2LvNbQHv8/FfAA/HfYCw/i9DYdse4q/vrCOC2TCxLCA8BfDyu87WAA/cREvldwJHxO9sJdeQF4KdxOz0Vyz8CnAi8PH5nR4znwRjHUFzPVXE9uuI+fgr4BqE+9sQYLojxDMTv9QGfi+v6QeBJwkMJn4jfu3KE+v5y4IbM9nlT3LYPA1fH+K4D/gvw47gei+N3P0FoD7fH6X9SZ/4fiPtjY4z1GuBHcf2r07bHdd8BbI37eSnQktm/HfH17lhuIO6zl8e41sf5PB/X3+PndxFOvL4a49gVl/XJuJ8eA/4voc3+AaEuDMRtfSeZNpJpV58gtIln4/ut7NtW2jJxf4XQJgbivtgG3AMsjmXXVGPMtMl/ievRS2gjhxOOC1vjvugD+gn19URCffh2nN8jMZZr4jr/IJZ7G6G9PhS3/88IbffXhP38XkLb2BHX/S6gUmd/Hkk4lmyK2+zhWPZHhDa3K067O65rf9wuj8bl3xHjuXSU4/BfxmVUj7GXA2fH7b0pbpf94ovfPT4u/wFCOz2o3jaKZfc7Pg8b0wQmnUPj/xVCg3sToWFUD3YnEw4s7wcOJhyoq0lnG/C1+PoLwMZq
Ra2znAvIHAxiBbssVq6dhINPC+Ggcw/hbHAbocFMjxvqm/G70+KOuC0zPwfeF1+/jNBYT4zvZ2aTDqGSdsaK00ZIGLuA0zOJ+GFgLqHiXxYr0Nfj+n+R0Hi/GivJR2LMy+P3X0dodNUDRw8xQWYq0sfi+48AV8fXXyY0msuBd8d1Oi5+9nHgr2qSjgNvjtOWx+/Njst+Q2bZdwA3EQ4af0Y42Dgh4bYCB8eys+L6GaGBDgAfIyT69XEfPgocR0hs98TvbInb74dx2/Rlts31wJXAjYRk9m5CwnowUy+ejfOfRzgIPEs4SD1LSCQXEPb/Y3HfPk5IUsfF/7sIZ4N3x/IjJZ3rgcvi6xJwCCHpDACvJdTBNXF7GiEJfyeTdB4gtJVZhLpxWGbexxDq5aw4z8G4P38e1//lhLb1PUKdcuCtMY7vsfeE6A7CCcsxscwH4zyuBP6K0EaeBq4A/oO9B8SvxBhvBw4lPJ7+j+L8PsXepPN/CG39mLi9qvXvUIZPOuvjMk5n37ZyA3BmfN1JqD/tMe5thDr5uRj/wljuBOD2TJt8Ku7jhYR9vpK9SecWwgnaZ+J+nh7Lbon7bjrhtyG/QTjpq+6zHXF/HkaojycTTn53E5LO/LgvLo9xfAP4wzr15V7gLEICPzWu27WE49fcuG1/RTjRfWtc1qy4ze4mHKtmEZJ72zB18ri4fdvZe4y9PFsXRqjP5bgtjo/vDya06XrbaN5wx+d6/yZyeO1SM3uAcACZB/weMODu1R/CWxQD7nP3HYQKgZkdQmgsJ5vZJwg7Z2gMyx0gnBVCOPv4irsPESrdOnfvJzTUHYTG0QIca2Zr47IOBmZk5jcI/Ht8/ek4zwvM7L2ERlL118DMOO+b43LWx3W5JZZ5glApvw+8BPgQoYJdS9iJ347llsZya4BXEs7YcPd1hEo6kuo81sR5QKi06+I8biIkjCXxs3MIB8ysre7+4/j6XwgV/3cIDevxOL36B17Hx3jPd/frCPtqHaGRftrM1sX1nUM4ON5H2BfzCAepNsIJyWrC2ebDwH/EevIdQiN8PWHIopXQgOYDpwDfjes0SDhw1XoUuMHdtxLOeG8jDPv0xuUSpw8B72Bvz+gdhLO42919G+Gg80id+WedQkiIuPugu2+vxuDu62Md3EA4oXFC3Zif+f533b0nrvcPCMkxO+9vZdoOhCS5Pi7vV3EdZsbPtrv7nXG7/Bth/9XGOkg4MN9O2K+LCG1hOqFedxAO1u3AHwMXA78JvI8wbPbnhPazOTPf6rXQUwiJYneM79nhNhqhzd0H3Mq+bSW7fWYCP3D3XYSD74o4/UFCj/qbsf3+Y4yxqkI4636EsO1PzHz2DUIdfppwED0qTr/N3be7+25CD/I3CPWxus9+5e7rCftnK3CIu/exb/2rJmHYtx0CYGYHEdrD7YQT1+/FdbuVcOLdldk28939TsK+OThOv8nde2N9eIrQruo5kVD/d2WPsQ36LeAJd18N4O473H0gfla7jY4Yw3wnJumY2UmEYYQ3uvsiwhBJW52iQzUxlAgHq92EA+UvCQf6mft/dVhOOEN4PWEn3ZyZXl3W9YQz9+Pj9Es8XBc5lnBdINu4d7v7oJn9LeHMYh4hCZ3J3gYC4aB5HKGi9wLEA01/rLDVGJ4EzgXud/fXuvs7MvPojf/3EA6wgzHm4f6CNzt9es08Btn3ceTZsn3AEjN7dQjTaw+otctzwn7JbsMqIxy4f2Vmp8T3DxB6PbMJPapjCftkOmE7lQmNdlWcdiRhCKa6rN7M/A8iNOBzYtxlGnzMOmEbVNdlKP6rvrf4/9oY73GE5OeEeuh1tsN4ZNdlKPN+iOH3T+17G+lzMzP23S+DI8yrOr+hWC+vJrS1owiJtbqv/5VwQNvp7tPd/RWEZHM54YTtDYRe7rTMfHdl5l+7zIFqjDHecpzeH5dR21aG2z79NdP6qm03/nsNw/NhXmffZ/dXtf1l91lvzXfq1cXBmte1
ZYzhZeMarg3XxjhSexhvHa63D6vGsvz9TFRP5xDgOXffZWZHEc6S7wdKZlbNymsJXdQTYuY/Fyi5+/OEyrvQ3b9GOLOuHlD7a+8cI/Q8DqqZdjVhLHNbPNurOt7MWuI8ZxK6tYPAhzPznUs46OxhZhcSrjNdSBgyWkUYxjg2U+wWQlf9/ezbELOeIZw5HQLMN7OjzOwYws0Lzw/znecJB3DM7LcJQ2xVzwG/EdfprGG+D+GA/ro4j9Pj8ocIvbN6NyAcbmZvjK/PIwwX/oRw4lC9OFzdXvcBf0IYNvoWobKuict4yt37zexk9p4NrSWcPQ/FM6VthAP+XTHO1wCtsU6cEefXR9gn2cp9G7G3Fi9ADxHqSW1dWGJm0wl1/c2EBFNlhH31S+B/xHUqEc7eXg2cEuvr+wiJcSS3AR+uxmNmB49SvtYSM5tuZi8lDAGtrpn3++JnVevY2xs6n73JH2CmmS2I9eIcwv6rjbXVzF7q7vcSek2vJPQMHyZsk98lnPyVzOxQMzuC0PvZGed3EWFI7I3s77ZYdhaAmR1KOHE4rrqu1D8JHc7zhJGPSnx/Rvy/B3jBzH4/LsfMbFHme7uA88zsVYRe2V2Zz36f0CM5PK77pjHEA6EX8gpC+2tj39GRIfavh3vEXkcX8RKDmZ0S70x7B+EYUbUwrtdbCEl7rO4EzjKzSqY9VdU7bmY9DBxmZsfHGA5q4OaSesfn/UxU0rmFUKnXES403kOodLcAvzCz5whjg8sJ1x66CBcrq3d/rABuMbOdhMZe3RHLgHVmdl1mWT8AjrZ9bxteSehxbK2J60nC9YGbYizvJGTtjcD9cXnLgZPMrMvM3hm/t5TQhb0DeNTMqvP5s+zM3f2bhOT6wUwDyRokXL/5JOGM7aeELvUQ4eBbzzZCxV5HODDel/lsGaG3djvxjqVh/B3hoH8ZoWI/Thgz/0PCMEOth4A/iss8FPiqu3cTxv2Xxltkq4n5A4QDyNcJifw5whnydUCHmXUSkubDsXz14u098f2jhIPcene/nzBk8t8Ivcm7CAeG3yZcV8gOs/4pIem/lZDkhggHyYPN7AlCoiBur5sIwy5fjLFVtRAORucR9sUWwj56lpCQ22J88wn1cyR/Sjgwrour9CgAAAJSSURBVI/xHDNK+VrVOO8BPhmH9QBw9w2Eayc/JPQOjVD3yoQD/xcI27R6Bvoc4QTowRj/DdkFxfn1AT+MQ+CDhO3SB/wT4QSmhdCGphPa5x2EE4KfEu7M+lRc/n53VMX5/2/gd82shzDk+zXgbWZ2H+HaS73h0OG8QLg2tTbGk00edwIfiuuxgb3DxhCG+75MaN+PE64jVm0inOj+17jex48hHtz9iRjPpwnDx32Zj7cD/93Mfkq4iaee84FLCdt4JeG4cST7DuPuJpw8LyVcd7uZcC2t0RjvJ5xUrmVve6q6htCW19Y7VsUhw3OAf4jb9nvsPfkfTr3j834K+dtrFv6O5e/d/cTMtGsIF90b/vsMaVy9bS7NF3s797v7mMbRR5nnjYR9d1uz5plnKRwLzOwOwkX/zqmOpdkK94sEZnYFIav/xVTH8mKhbT454h/z/YTQs2nG/Gaa2c+AnhdLwpGpl0xPJw51fTa+LRG6or3svUsE4O3u/gzjYGb3sv+1mPPjnSr1yn+QMKSS9WN3/2j8/AZgQc3nH3f3W8cR24jLqin7l4Tx6qxvuvunasq9lDD2XuvthCGchrfFMDE3FEem/FWEay5ZX3L3f6op91rC37RkHUQYo856iDAclNXr7ieMFvtIxrpeB7CceuvZUPyZuldtJ7DvheHNhCG1Me3T8Wq0/o5UJ0dq183YJyPNY6zHhpr5Vuv1HML1XAhDtc9Sp36Px3i3W+b7TTtWNSKZpCMiIukr3PCaiIjkl5KOiIhMGiUdERGZNEo6IiIyaf4TRSaQAceksloAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Missing-value rate of every numeric feature, highest first.\n",
    "data_num = data.select_dtypes('number').copy()\n",
    "# count() only tallies non-null cells, so 1 - count/rows is the share missing.\n",
    "data_num_miss_rate = (1 - data_num.count() / len(data_num)).sort_values(ascending=False)\n",
    "print(data_num_miss_rate.head(20))\n",
    "data_num_miss_rate.plot()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Text(0, 0.5, '缺失占比')"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAl8AAAGACAYAAACTPwd6AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAfkUlEQVR4nO3df5hdVX3v8feHJGAEiyCRH6IiFW1FiGKKpYJGLghSgUot0CqK1aJcRG+1WnxAQYu9mireagVBsUVoUVoFRVCQCkgt1gYF1KqVWrQXTY2CBlCRH9/+sXfMOE6SCc5eZ+bk/XqeebLnnHX2/p5DmPlkrbXXSlUhSZKkNjYZdQGSJEkbE8OXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZoTkmQGzrFlkgfMRD2SdH8ZviSNTJJ9knxs0mPHJ3n9FM1fmuTMaZ53+yTnT/HUccCfTeP1T0/ytPW0uSHJI5NsnmTHJNsleVeSV/XHj0gyv287fzp1S9o4+ANB0ij9BFgFPwso9wE/Arboe7oeUFU/TrI98DrgkCSvBo4Ffgo8ALgbKOBXgD2q6tvAi4FrJ5yXqroHuAdYOVUhSZ4E7FZVfwM8pT/v1euofVVfw68Df9S33xN4LPAoYDPgROC7wCeSvKuqLkzyHuB/AbdOONdjgUOr6lPT+MwkzXGGL0kjkeQKukB1X//QIcBJwFbAAuB3gR8kORy4GHgzcF1VLQf+YsI5Tq2qqyac92HAMcCKJG8A/h34YJIX0gW0e5IcBnyiqk6ZUNILgM/3x/fShcDV59wUuLv6Van7Hrh76cLVx4En04WxhwIPoguF366q7/anOBn4SJLr6ULa66vqvAnnv6p/XNJGwPAlqbkkOwOLgeOBPZL8WVW9DvhwkqOB7arqzX3b/ekC1BXAZ5M8taruWsfpjwfeBlwEvLuqDuwfPy3JCcAKuh6rJ0+oZxHwHOC3krwM2A64N8kL+iabAYcBX+/bPo4ufN0MPL2/5t3AS4D/D1wCbJLk0VV1U1V9JsmJ/WtWh83J3G5E2kgYviSNwr7AMuBK4P8Ar0/yQeAxwDy6+fWvBL4DnAM8F7gM+BRdz9X2VfWdtZz7lP4ci+Bnw46pqruBTemGHucDP57wmnfS9aCd3r/mVOA/q+rsKc7/eOBGYFfgPLoQdzhdsPohXc/XkXRzav8JuAmgqs7szz1vLXU7B1faSBi+JI3CFXS9TzsAAZ5UVUdMbJDkq1W1uD9+BbA/cGD/mmuBh/dNz0zyI+DeqlrStzmFLoA9jG7e1uphy4V04euB9MN8STYBLgQ+NOHyD+bnhx0DbNr3uC2gC4K7AlTVR5L8QX+tid5VVRdMOMcWwPbAt4FTkywDbgceQteD9pNpfXKS5ry4sbak1vowswfd3YfPBs6kG447aEKzX6MbbnwbcALwiKraPMk2wGer6tFTzfnqz78I2Bx4N/Ayup62Q4Dd6ELV1+nmZX2Arvfto8DEoczH04W0r64+Jd3Q4+Kqure/xlXAkVW1IslX6YYc7+3bH0bXc/bOCTWdDCyqqpcleQ1d+PwS8LyqOnpDPj9Jc5s9X5JGYR+6JR+uBC6pqhP6Se0fqKoboev5Ap4ALAF+H7hqA87/T8AB/fERdPO8Dga+QRfyDq2f/5fnbqsPkvw63Zytu4AXV9X107jePXRz2FbP59qxv9bqc+5AdxPAE/qHHgd8ZAPej6Qx4hwDSc1V1aer6mnAxDW+7gHOT/LYCe3uq6rPTTMATXTnhOP76HqkDgS+QndH4wunelHfq/YBut6yFwEXJtl3mtc8nG6u15HAkyY993bg7VW1MslD6YZQL5t07Z2SbDbNa0maw+z5kjRK6b+oqvuSvByofpL85H8czu9Xp59ywnrfc3Zfv57XI4H30M3DOpJu0v4JdBP3VwCfSfKNSUtUHEA38X5ZVV3aP3Yk3TIVXwLOAi6b
cKflphNqDLC0vzZJXkI3N4wku9P1eB3VB69LgbdU1Y+SFN1wJsCf0N0p+ebpfniS5ibDl6RR2ow14YOq+scku9AthPq+SW0XAK+kWxLiB0mW94+/dfWEeLqJ9h+i63m6paru7gPPZcD/q6rrAJL8Pt2yFgfSDQ9+mq6H7Kiq+pcJ9fxLkscBrwLeSDf0uTp8bdLXRH9t+nM/EHg5Xe8ZVXVjkicCS4HzgbdW1Tv65l8Bnpjk83Rz0J457U9O0pzlhHtJs06S1Az+cEqyW1V9cdJjW1XVbf3xrwFfm8lrTlHDZsD2VXXzUNeQNDcYviRJkhpywr0kSVJDhi9JkqSG5syE+2222aZ22mmnUZchSZK0Xtddd933qmrRVM/NmfC10047sXz58vU3lCRJGrEk31zbcw47SpIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoaLHwlOTvJtUlOWk+705McPFQdkiRJs8kg4SvJYcC8qtoL2DnJLmtptw+wXVVdPEQdkiRJs81QPV9LgQv648uBvSc3SLIAeA9wc5JDpzpJkmOSLE+yfOXKlQOVKkmS1M5Q4Wtz4Jb++FZg2ynaPB/4N2AZsGeS4yc3qKqzqmpJVS1ZtGjKvSklSZLmlKHC1x3Awv54i7Vc54nAWVW1AjgPePpAtUiSJM0aQ4Wv61gz1LgYuHmKNjcBO/fHS4C17v4tSZI0LuYPdN6LgGuS7AA8EzgyyalVNfHOx7OB9yU5ElgAPOf+XOhJr37/L11sa9f9xfNHXYIkSRqRQcJXVa1KshTYH1jWDy3eMKnN7cDvDXF9SZKk2Wqoni+q6jbW3PEoSZIkXOFekiSpKcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGBgtfSc5Ocm2Sk9by/Pwk30pyVf+121C1SJIkzRaDhK8khwHzqmovYOcku0zRbHfg/Kpa2n99cYhaJEmSZpOher6WAhf0x5cDe0/R5jeBZyX5XN9LNn9ygyTHJFmeZPnKlSsHKlWSJKmdocLX5sAt/fGtwLZTtPlXYL+q2hNYABw0uUFVnVVVS6pqyaJFiwYqVZIkqZ1f6G2aIXcAC/vjLZg65N1YVXf1x8uBqYYmJUmSxspQPV/XsWaocTFw8xRtzk2yOMk84HeAGwaqRZIkadYYKnxdBByV5DTgcODLSU6d1OaNwLnA9cC1VXXFQLVIkiTNGoMMO1bVqiRLgf2BZVW1gkk9W1X1Jbo7HiVJkjYaQ835oqpuY80dj5IkScIV7iVJkpoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJamiw8JXk7CTXJjlpPe22TfKFoeqQJEmaTQYJX0kOA+ZV1V7Azkl2
WUfztwILh6hDkiRpthmq52spcEF/fDmw91SNkuwL3AmsGKgOSZKkWWWo8LU5cEt/fCuw7eQGSTYFXgecsLaTJDkmyfIky1euXDlIoZIkSS0NFb7uYM1Q4hZruc4JwOlV9YO1naSqzqqqJVW1ZNGiRQOUKUmS1NZQ4es61gw1LgZunqLNfsBxSa4CnpDkvQPVIkmSNGvMH+i8FwHXJNkBeCZwZJJTq+pndz5W1VNXHye5qqpePFAtkiRJs8Yg4auqViVZCuwPLKuqFcAN62i/dIg6JEmSZpuher6oqttYc8ejJEmScIV7SZKkpgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpoWmFryQPnPR9khyYJMOUJUmSNJ7WG76SvBg4fdLDDwWOA943RFGSJEnjajo9X+8DdkyyZPUDVfXfwKHArw5VmCRJ0jhab/iqqvuAlwEnb+hrJUmS9PPmr+vJJF8GbgcK2D7JP094eiFw0YC1SZIkjZ11hq+q2nX1cZLjqupd/fHLgXOr6raB65MkSRorGzJ0ePSE4zuAV85sKZIkSeNvQ8LXjyccnwMclGSbGa5HkiRprK1vztcNwI/6bxdOmPMVYDvgKODtw5UnSZI0XtY352vx2p5LMq+q7p35kiRJksbXdBZZnZfkDZMfN3hJkiRtuHX2fEEXspI8HTg5yUuARcB9/dMLq+p1QxYoSZI0TjZ0odTnAd8CbgFeCHx9xiuSJEkaY+vt+Zqsqt4PkOSlq48lSZI0Peu723ExsCWwZZKnAr8y4ekasjBJkqRxtL6er+cAS4BvA68Fdhy8IkmSpDG2vqUmfm4yfZJrhi1HkiRpvG3wnK8kh9Atsrp1koOr6uKZL0uSJGk8bejdjhcDzwD2Az4JPGvGK5IkSRpjG9TzVVXLhipEkiRpY7BBPV9Jdp9wvGuSzX6ZiyfZOsn+btAtSZI2FtMKX0n+d3/4jgkPvw142jpec3aSa5OctJbntwI+BuwJXJlk0fRKliRJmrum2/P17P7PHwMkeQKwRVVdPlXjJIcB86pqL2DnJLtM0Wx34JVV9SbgMmCPDapckiRpDppu+Lq7/7OSPAQ4A3jROtovBS7ojy8H9p7coKqurqrP9ou37glcO7lNkmOSLE+yfOXKldMsVZIkafZaZ/hK8ookxwI7Jnke3SKrHwWOr6qvreOlm9Pt/whwK7DtWs4f4AjgNtYEvJ+pqrOqaklVLVm0yFFJSZI0962v52slXTC6D1gIbAYsAH51Pa+7o28PsMXarlOd44AbgUOmWbMkSdKctc7wVVV/Rzcp/ttV9R7gG8BvAYckef46Xnoda4YaFwM3T26Q5E8nnOPBwA82rHRJkqS5Z33Djo8GrgAelWQhXWfVPcDRwLFJdlrLSy8CjkpyGnA48OUkp05qc1bf5tPAPLq5YZIkSWNtfXs73pRkL+B5wPvp5nJRVXcn+XPgFcAfT/G6VUmWAvsDy6pqBXDDpDa39c9LkiRtNNa7wn1VFXBukg8Av5HkUVX1n8Cl9GFsLa+7jTV3PEqSJInpL7J6HfAJ4Exg6/7hP6K7U1GSJEnTtM6erySbV9WdwH9X1UFJjgB2T/IK4E66+VySJEmapvUNO364n2j/mCQXAw8HdgPeC/xJVf3C2lySJElau/VNuD8gyabApVV1cJLfB74H7ABcluTVVfULK9NLkiRpatOZ8/UQ4G+SPJxuwdS9+seuAk5Ostlw5UmSJI2X9d7tSLcA6l50C6wuoFtAdSHd/o2/XVX3DladJEnSmFnfhPslwL3AU4Ht+q+vAQEebvCSJEnaMOvr+XoY8Brg68AbgKcAq4BdgF2TPKSqvj9siZIkSeNjfRPu
P5Lko8DxwE50Q44rgS8AHwJuH7pASZKkcTLdFe7f0aAWSZKksTetFe4lSZI0MwxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYGC19Jzk5ybZKT1vL8lkk+nuTyJBcm2XSoWiRJkmaLQcJXksOAeVW1F7Bzkl2maPZc4LSqegawAjhwiFokSZJmk/kDnXcpcEF/fDmwN/D1iQ2q6vQJ3y4CvjtQLZIkSbPGUMOOmwO39Me3AtuurWGSvYCtquqzUzx3TJLlSZavXLlymEolSZIaGip83QEs7I+3WNt1kmwNvBP4w6mer6qzqmpJVS1ZtGjRIIVKkiS1NFT4uo5uqBFgMXDz5Ab9BPu/B15bVd8cqA5JkqRZZajwdRFwVJLTgMOBLyc5dVKbFwF7ACcmuSrJEQPVIkmSNGsMMuG+qlYlWQrsDyyrqhXADZPanAGcMcT1JUmSZquh7nakqm5jzR2PkiRJwhXuJUmSmjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLU0GDhK8nZSa5NctI62myb5JqhapAkSZptBglfSQ4D5lXVXsDOSXaZos1WwDnA5kPUIEmSNBsN1fO1FLigP74c2HuKNvcCRwCr1naSJMckWZ5k+cqVK2e8SEmSpNaGCl+bA7f0x7cC205uUFWrquqH6zpJVZ1VVUuqasmiRYsGKFOSJKmtocLXHcDC/niLAa8jSZI0pwwViq5jzVDjYuDmga4jSZI0pwwVvi4CjkpyGnA48OUkpw50LUmSpDlj/hAnrapVSZYC+wPLqmoFcMNa2i4dogZJkqTZaJDwBVBVt7HmjkdJkiThRHhJkqSmDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqSHDlyRJUkOGL0mSpIYMX5IkSQ0ZviRJkhoyfEmSJDVk+JIkSWrI8CVJktSQ4UuSJKkhw5ckSVJDhi9JkqSGDF+SJEkNGb4kSZIaMnxJkiQ1ZPiSJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhgxfkiRJDRm+JEmSGjJ8SZIkNWT4kiRJasjwJUmS1JDhS5IkqaH5oy5A6/atN+426hI2yCNe/8UNav+Udz5loEqG8ZnjPzPttlc/9WkDVjKMp3366mm3/atXXTxgJcN42dsOnnbbNz3vOQNWMowTz/uHabf9yps+NWAlM+/XT9x3g9qfcsopwxQykA2p94K/33O4QgZy+O99btptF//DZQNWMowbnnPABrW350uSJKkhw5ckSVJDhi9JkqSGBgtfSc5Ocm2Sk36ZNpIkSeNkkPCV5DBgXlXtBeycZJf700aSJGncpKpm/qTJO4BPVNWlSY4EFlbVX9+PNscAx/TfPhb42owXu3bbAN9reL3WfH9z
2zi/v3F+b+D7m+t8f3NX6/f2yKpaNNUTQy01sTlwS398K7DH/WlTVWcBZw1R4PokWV5VS0Zx7RZ8f3PbOL+/cX5v4Pub63x/c9dsem9Dzfm6A1jYH2+xlutMp40kSdJYGSrwXAfs3R8vBm6+n20kSZLGylDDjhcB1yTZAXgmcGSSU6vqpHW0+c2Barm/RjLc2ZDvb24b5/c3zu8NfH9zne9v7po1722QCfcASbYC9gc+XVUr7m8bSZKkcTJY+JIkSdIvcpK7xk6SrZPsn2SbUdciSdJkhq+1SLJtkmtGXcdMS7Jlko8nuTzJhUk2HXVNM6kfyv4YsCdwZZIp11iZy/q/m18YdR0zLcn8JN9KclX/tduoaxpCktOTHDzqOmZakmMn/Le7PsmZo65pJiXZKsmlSZaP4Xt7VJJLklyT5G2jrmcmTf5dPlt21jF8TaH/BX4O3Vpk4+a5wGlV9QxgBXDgiOuZabsDr6yqNwGXMfUac3PdW1mzTMs42R04v6qW9l9fHHVBMy3JPsB2VXXxqGuZaVV1xur/dsA1wHtGXNJMOwr4236dqAclmRXrRc2QtwB/VlX7ADsmWTriembE5N/ls2lnHcPX1O4FjgBWjbqQmVZVp1fVJ/tvFwHfHWU9M62qrq6qzyZ5Kl3v17WjrmkmJdkXuJMuOI+b3wSeleRz/b9Oh7obeySSLKALJDcnOXTU9QwlycOAbatq+ahrmWHfBx6f5MHAw4H/GnE9M+kxwOf74+8CW46wlpk0+Xf5UuCC/vhy1ix31ZzhawpVtaqqfjjqOoaUZC9gq6r67KhrmWlJQvc/3G3A3SMuZ8b0Q8SvA04YdS0D+Vdgv6raE1gAHDTiemba84F/A5YBeyY5fsT1DOU44IxRFzGAfwIeCbwc+Ardzizj4h+Ak/vh8AOBfxxxPTNiit/lk3fW2bZ9VR3D10YoydbAO4E/HHUtQ6jOccCNwCGjrmcGnQCcXlU/GHUhA7mxqr7THy8HRjYkMJAnAmf1y+qcBzx9xPXMuCSb0L2vq0ZcyhBOBl5aVW8Evgq8cMT1zJiqOhX4OPBi4JyqumPEJQ1l1uysY/jayPS9J38PvLaqvjnqemZakj9N8vz+2wcD4xRU9gOOS3IV8IQk7x1xPTPt3CSLk8wDfge4YdQFzbCbgJ374yXA2P3/B+wD/EuN5xpGWwG79X8/nwyM23u8HngEcNqoCxnQrNlZx3W+1iHJVf3k0bGR5Fjgz1nzi+2MqvrgCEuaUf0EywuAzYAvAceN4y+CMf27+Xjg74AAH62qE0dc0oxK8iDgfXRDHQuA51TVLet+1dyS5M+B5VX14VHXMtOS7An8Nd3Q47XAs8ephyjJG4CbqurcUdcy01b/vEzyK3Q3g/wj/c46o5piZPiSJEkbhdmys47hS5IkqSHnfEmSJDVk+JIkSWrI8CVJA0nykH75BUn6GX8oSJpVkvxOf2fZxMcu7+8WnPjYLkk2SfLcJH+8jvNdluSBfdv055rXLxmwuk2SfCTJjpP3O+33nJzXH++b5C0TnrswyRZJLl/L5c8DnjT9dy9pY2D4kjRr9L1ErwHuTfK4JDv0YWhhVd0+qd15dCtW3wfcNcW5Hp3k9cDdVfUj4AC6W8yfSLfv5xUTgtbJwBfoFmC8st8c+o5+TbUrgYP6a/4U+HGShyZ5JrA9cA+wZZID+uUyVl//uXTb0PzfJFckuTrJd5M8dMY+MElz0ljtnSZpzjuebm+5c+jW4/ln4GjgsUk+Qbdn4BPpNoh/MN2CwdsD85IcQrd+1oVV9Vd0G3XvR7dR8BnAy6vq40k+VlXPgi7E9fssPo5uS5z7quop/XPXT1xLLckL6LaW2bp/6Fa6VbKh+1m6C/Cjvu3BdDtInAn8XVV9P8lfAsuqaqz2U5W04QxfkmaTb9L1Qp0C/C3wX1V1bpJLquq3k1zZ91YdCuxRVXcmOQb4SVW9f9K5fgt4KV3gOTbJsiRLAJLcRLf35weBtwGf
oesVe3aSvarqZxuy90OO91XVOUn+gy7QvYFu5OAZfbPbgdP79gF2BQ6j62W7JMn1wL9V1SUz+FlJmqMMX5Jmk+8ATwG2A54APC7JHXS9V0cCDwV+g26V+A91OYedgbuS/EF/jgfS9VC9Bfg+QJLDgddV1V3996cAlwCfp1tR/3Tgmqr6RpIP99cmyRV0PyePB74IzOuv/3LgcODX+mtuR7ef4UlV9Wngzf3rbwN+Anyqqi6Ywc9J0hxm+JI0m6wEVgFfo+sFexLd0ONudPO7jgZurap9AJIs6J+/BziqqlauPlGSXYHX0u3l9kngFUl+m25Pvp2Ag4F30Q1V/nRCDT9ZfVBV+00434uA3wX+i26D879M8rH+6RWrhyj7FbSfAbwAeAzd/nHH9D10WwFXV9Ur7/9HJGmuM3xJmk0W0s2dAngAcAewA3A28PSqOnt1wz54nUO3EfB/ABcnObaqvtA32QT4CPCYPihtSrerx1v6nq+PVdXyJJvRbbh79LoKq6qzk3yNbtjx+H6u2OoJ9tsluRq4HDgf2BR4N93Q6CkTal4KHLjBn4qksWL4kjSbzAMOogsv29BtAH8Y3Z2Kb+03hn8P3aa4JwJnVtX5AEleCLw3yZ3Aq6rqi/3jrwaoqp8m2S7J5hMvWFV39fO0MnmZif7184BNquruCQ//JfB24OL++xXAvsC8vt03kuwNPL//c7Wt6HrhJG3EDF+SZpN/Bw6qqpVJ5gOXAq+pqtuTHE93d+MPgScDf1BV31j9wqr6Sh90DgBumnDOBUk2q6q7quqPk7yErvfp3RPabAY8iG4Jijv6uV7f6//cBPhkkvfR9cCdWlX3AkwIa/Or6j66ZS8mnvP9U/R8HfrLfECS5j431pY0ayVZWFU/nvB96pf8ofXLnCPJgkk9YOtquwldj9k99+daksaX4UuSJKkhV7iXJElqyPAlSZLUkOFLkiSpIcOXJElSQ4YvSZKkhv4HB16ri7qbTKwAAAAASUVORK5CYII=\n",
      "text/plain": [
       "<Figure size 720x432 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# Bar chart of the (up to) ten numeric features with the highest missing-value rate.\n",
    "import matplotlib.pyplot as plt\n",
    "import seaborn as sns\n",
    "\n",
    "# The labels below are Chinese; SimHei is requested so they can be rendered\n",
    "# (NOTE(review): the font must be installed locally - confirm on the target machine).\n",
    "plt.rcParams['font.family']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus']=False\n",
    "\n",
    "# Recomputed here so the cell does not depend on the previous cell having run.\n",
    "data_num = data.select_dtypes('number').copy()\n",
    "data_num_miss_rate = (1 - data_num.count() / len(data_num)).sort_values(ascending=False)\n",
    "\n",
    "top_missing = data_num_miss_rate[:10]\n",
    "# Derive the rank labels from the slice itself so frames with fewer than ten\n",
    "# numeric columns still plot instead of failing on an x/y length mismatch.\n",
    "ranks = list(range(1, len(top_missing) + 1))\n",
    "\n",
    "fig, ax1 = plt.subplots(figsize=(10, 6))\n",
    "# x and y are passed as keywords: seaborn >= 0.12 rejects them as positionals.\n",
    "sns.barplot(x=ranks, y=top_missing.values, ax=ax1)\n",
    "ax1.set_title('特征缺失情况')\n",
    "ax1.set_xlabel('缺失特征排名')\n",
    "# Trailing semicolon suppresses the Text(...) repr in the cell output.\n",
    "ax1.set_ylabel('缺失占比');"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [
    {
     "ename": "KeyError",
     "evalue": "'reg_preference_for_trad'",
     "output_type": "error",
     "traceback": [
      "\u001B[1;31m---------------------------------------------------------------------------\u001B[0m",
      "\u001B[1;31mKeyError\u001B[0m                                  Traceback (most recent call last)",
      "\u001B[1;32m~\\Anaconda3\\envs\\Nopimal\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001B[0m in \u001B[0;36mget_loc\u001B[1;34m(self, key, method, tolerance)\u001B[0m\n\u001B[0;32m   2645\u001B[0m             \u001B[1;32mtry\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2646\u001B[1;33m                 \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_engine\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mget_loc\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   2647\u001B[0m             \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\index.pyx\u001B[0m in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\index.pyx\u001B[0m in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001B[0m in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001B[0m in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;31mKeyError\u001B[0m: 'reg_preference_for_trad'",
      "\nDuring handling of the above exception, another exception occurred:\n",
      "\u001B[1;31mKeyError\u001B[0m                                  Traceback (most recent call last)",
      "\u001B[1;32m<ipython-input-19-357f50622297>\u001B[0m in \u001B[0;36m<module>\u001B[1;34m\u001B[0m\n\u001B[0;32m      2\u001B[0m \u001B[0mdata_str\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mdata\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mselect_dtypes\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mexclude\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;34m'number'\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcopy\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      3\u001B[0m \u001B[0mdata_str\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mdescribe\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m----> 4\u001B[1;33m \u001B[0mdata_str\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m'reg_preference_for_trad'\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mfillna\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mdata_str\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;34m'reg_preference_for_trad'\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mmode\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m[\u001B[0m\u001B[1;36m0\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m,\u001B[0m\u001B[0minplace\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;32mTrue\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m      5\u001B[0m \u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m      6\u001B[0m \u001B[0mdic\u001B[0m\u001B[1;33m=\u001B[0m\u001B[1;33m{\u001B[0m\u001B[1;33m}\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32m~\\Anaconda3\\envs\\Nopimal\\lib\\site-packages\\pandas\\core\\frame.py\u001B[0m in \u001B[0;36m__getitem__\u001B[1;34m(self, key)\u001B[0m\n\u001B[0;32m   2798\u001B[0m             \u001B[1;32mif\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcolumns\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mnlevels\u001B[0m \u001B[1;33m>\u001B[0m \u001B[1;36m1\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2799\u001B[0m                 \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_getitem_multilevel\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2800\u001B[1;33m             \u001B[0mindexer\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mcolumns\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mget_loc\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   2801\u001B[0m             \u001B[1;32mif\u001B[0m \u001B[0mis_integer\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mindexer\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2802\u001B[0m                 \u001B[0mindexer\u001B[0m \u001B[1;33m=\u001B[0m \u001B[1;33m[\u001B[0m\u001B[0mindexer\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32m~\\Anaconda3\\envs\\Nopimal\\lib\\site-packages\\pandas\\core\\indexes\\base.py\u001B[0m in \u001B[0;36mget_loc\u001B[1;34m(self, key, method, tolerance)\u001B[0m\n\u001B[0;32m   2646\u001B[0m                 \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_engine\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mget_loc\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2647\u001B[0m             \u001B[1;32mexcept\u001B[0m \u001B[0mKeyError\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[1;32m-> 2648\u001B[1;33m                 \u001B[1;32mreturn\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_engine\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mget_loc\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0m_maybe_cast_indexer\u001B[0m\u001B[1;33m(\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0m\u001B[0;32m   2649\u001B[0m         \u001B[0mindexer\u001B[0m \u001B[1;33m=\u001B[0m \u001B[0mself\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mget_indexer\u001B[0m\u001B[1;33m(\u001B[0m\u001B[1;33m[\u001B[0m\u001B[0mkey\u001B[0m\u001B[1;33m]\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mmethod\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mmethod\u001B[0m\u001B[1;33m,\u001B[0m \u001B[0mtolerance\u001B[0m\u001B[1;33m=\u001B[0m\u001B[0mtolerance\u001B[0m\u001B[1;33m)\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n\u001B[0;32m   2650\u001B[0m         \u001B[1;32mif\u001B[0m \u001B[0mindexer\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0mndim\u001B[0m \u001B[1;33m>\u001B[0m \u001B[1;36m1\u001B[0m \u001B[1;32mor\u001B[0m \u001B[0mindexer\u001B[0m\u001B[1;33m.\u001B[0m\u001B[0msize\u001B[0m \u001B[1;33m>\u001B[0m 
\u001B[1;36m1\u001B[0m\u001B[1;33m:\u001B[0m\u001B[1;33m\u001B[0m\u001B[1;33m\u001B[0m\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\index.pyx\u001B[0m in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\index.pyx\u001B[0m in \u001B[0;36mpandas._libs.index.IndexEngine.get_loc\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001B[0m in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;32mpandas\\_libs\\hashtable_class_helper.pxi\u001B[0m in \u001B[0;36mpandas._libs.hashtable.PyObjectHashTable.get_item\u001B[1;34m()\u001B[0m\n",
      "\u001B[1;31mKeyError\u001B[0m: 'reg_preference_for_trad'"
     ]
    }
   ],
   "source": [
    "# 数据处理\n",
    "data_str=data.select_dtypes(exclude='number').copy()\n",
    "data_str.describe()\n",
    "data_str['reg_preference_for_trad'].fillna(data_str['reg_preference_for_trad'].mode()[0],inplace=True)\n",
    "\n",
    "dic={}\n",
    "for i,val in enumerate(list(data_str['reg_preference_for_trad'].unique())):\n",
    "    dic[val]=i\n",
    "data_str['reg_preference_for_trad']=data_str['reg_preference_for_trad'].map(dic)\n",
    "\n",
    "data_str['latest_query_time_month']=pd.to_datetime(data_str['latest_query_time']).dt.month\n",
    "data_str['latest_query_time_weekday']=pd.to_datetime(data_str['latest_query_time']).dt.weekday\n",
    "\n",
    "data_str['loans_latest_time_month']=pd.to_datetime(data_str['loans_latest_time']).dt.month\n",
    "data_str['loans_latest_time_weekday']=pd.to_datetime(data_str['loans_latest_time']).dt.weekday\n",
    "\n",
    "data_str.drop(['latest_query_time','loans_latest_time'],axis=1,inplace=True)\n",
    "for i in data_str.columns:\n",
    "    data_str[i].fillna(data_str[i].mode()[0],inplace=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 划分训练集测试集\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.preprocessing import StandardScaler\n",
    "features=[x for x in data_all.columns if x not in ['status']]\n",
    "X=data_all[features]\n",
    "y=data_all['status']\n",
    "\n",
    "X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 特征归一化\n",
    "std=StandardScaler()\n",
    "X_train=std.fit_transform(X_train)\n",
    "X_test=std.fit_transform(X_test)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 模型评估\n",
    "from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score\n",
    "from sklearn.metrics import roc_auc_score,roc_curve,auc\n",
    "import matplotlib.pyplot as plt\n",
    "\n",
    "def model_metrics(clf,X_train,X_test,y_train,y_test):\n",
    "    # 预测\n",
    "    y_train_pred=clf.predict(X_train)\n",
    "    y_test_pred=clf.predict(X_test)\n",
    "    y_train_proba=clf.predict_proba(X_train)[:,1]\n",
    "    y_test_proba=clf.predict_proba(X_test)[:,1]\n",
    "\n",
    "    # 准确率\n",
    "    print('[准确率]',end='')\n",
    "    print('训练集：{:.4f}'.format(accuracy_score(y_train,y_train_pred)),end='')\n",
    "    print('测试集：{:.4f}'.format(accuracy_score(y_test,y_test_pred)),end='')\n",
    "\n",
    "    # 精准率\n",
    "    print('[精准率]',end='')\n",
    "    print('训练集：{:.4f}'.format(precision_score(y_train,y_train_pred)),end='')\n",
    "    print('测试集：{:.4f}'.format(precision_score(y_test,y_test_pred)),end='')\n",
    "\n",
    "    # 召回率\n",
    "    print('[召回率]',end='')\n",
    "    print('训练集：{:.4f}'.format(recall_score(y_train,y_train_pred)),end='')\n",
    "    print('测试集：{:.4f}'.format(recall_score(y_test,y_test_pred)),end='')\n",
    "\n",
    "    # f1-score\n",
    "    print('[f1-score]',end='')\n",
    "    print('训练集：{:.4f}'.format(f1_score(y_train,y_train_pred)),end='')\n",
    "    print('测试集：{:.4f}'.format(f1_score(y_test,y_test_pred)),end='')\n",
    "\n",
    "    # auc取值：用roc_auc_score或auc\n",
    "    print('[auc值]',end='')\n",
    "    print('训练集：{:.4f}'.format(roc_auc_score(y_train,y_train_proba)),end='')\n",
    "    print('测试集：{:.4f}'.format(roc_auc_score(y_test,y_test_proba)),end='')\n",
    "\n",
    "    # roc曲线\n",
    "    fpr_train,tpr_train,thresholds_train=roc_curve(y_train,y_train_proba,pos_label=1)\n",
    "    fpr_test,tpr_test,thresholds_test=roc_curve(y_test,y_test_proba,pos_label=1)\n",
    "\n",
    "    label=['Train - AUC:{:.4f'.format(auc(fpr_train,tpr_train)),\n",
    "           'Test - AUC:{:.4f'.format(auc(fpr_test,tpr_test))]\n",
    "    plt.plot(fpr_train,tpr_train)\n",
    "    plt.plot(fpr_test,tpr_test)\n",
    "    plt.plot([0,1],[0,1],'d--')\n",
    "    plt.xlabel('False Positive Rate')\n",
    "    plt.ylabel('True Positive Rate')\n",
    "    plt.legend(label,loc=4)\n",
    "    plt.title('ROC curve')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "pycharm": {
     "name": "#%%\n"
    }
   },
   "outputs": [],
   "source": [
    "# 模型\n",
    "from sklearn.model_selection import GridSearchCV\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn import svm\n",
    "from sklearn.tree import DecisionTreeClassifier\n",
    "from lightgbm.sklearn import LGBMClassifier\n",
    "from sklearn.metrics import confusion_matrix\n",
    "from mlxtend.classifier import StackingClassifier\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "plt.rcParams['font.family']=['SimHei']\n",
    "plt.rcParams['axes.unicode_minus']=False\n",
    "\n",
    "# rf\n",
    "rf=RandomForestClassifier(random_state=2018)\n",
    "param={'n_estimators':[40,60,800],'max_depth':[i for i in range(6,10)],\n",
    "       'criterion':['entropy'],'min_samples_split':[5,6,7,8]}\n",
    "gsearch=GridSearchCV(rf,param_grid=param,scoring='roc_auc',cv=4)\n",
    "gsearch.fit(X_train,y_train)\n",
    "print('最佳参数: ',gsearch.best_params_)\n",
    "print('训练集的最佳分数：',gsearch.best_score_)\n",
    "print('测试集的最佳分数: ',gsearch.score(X_test,y_test))\n",
    "\n",
    "rf=RandomForestClassifier(criterion='entropy',max_depth=9,min_samples_split=7,n_estimators=800)\n",
    "rf.fit(X_train,y_train)\n",
    "model_metrics(rf,X_train,X_test,y_train,y_test)\n",
    "\n",
    "# svm_linear\n",
    "svm_linear=svm.SVC(kernel='linear',probability=True).fit(X_train,y_train)\n",
    "model_metrics(svm_linear,X_train,X_test,y_train,y_test)\n",
    "# svm poly\n",
    "svm_poly=svm.SVC(C=0.01,kernel='poly',probability=True).fit(X_train,y_train)\n",
    "model_metrics(svm_poly,X_train,X_test,y_train,y_test)\n",
    "\n",
    "# svm_rbf\n",
    "svm_rbf=svm.SVC(kernel='rbf',probability=True,gamma=0.01,C=0.1)\n",
    "svm_rbf.fit(X_train,y_train)\n",
    "model_metrics(svm_rbf,X_train,X_test,y_train,y_test)\n",
    "\n",
    "# svm_sigmoid\n",
    "svm_sigmoid=svm.SVC(C=0.05,kernel='sigmoid',probability=True)\n",
    "svm_sigmoid.fit(X_train,y_train)\n",
    "model_metrics(svm_sigmoid,X_train,X_test,y_train,y_test)\n",
    "\n",
    "# dt\n",
    "dt=DecisionTreeClassifier(max_depth=9,min_samples_split=100,min_samples_leaf=90,max_features=9)\n",
    "dt.fit(X_train,y_train)\n",
    "model_metrics(dt,X_train,X_test,y_train,y_test)\n",
    "\n",
    "# lr\n",
    "lr=LogisticRegression(C=0.04,penalty='l1')\n",
    "model_metrics(lr,X_train,X_test,y_train,y_test)\n",
    "\n",
    "\n",
    "# lgb\n",
    "lgb=LGBMClassifier(learning_rate=0.1,n_estimators=50,max_depth=3,\n",
    "                  min_child_weight=7,gamma=0,subsample=0.5,colsample_bytree=0.8,\n",
    "                  reg_alpha=1e-5,nthread=4,scale_pos_weight=1)\n",
    "model_metrics(lgb,X_train,X_test,y_train,y_test)\n",
    "\n",
    "# 模型融合\n",
    "sclf_lr=StackingClassifier(classifiers=[lr,svm_linear,svm_rbf,rf,lgb],\n",
    "                           meta_classifier=lr,\n",
    "                           use_probas=True,\n",
    "                           average_probas=True,\n",
    "                           use_features_in_secondary=True)\n",
    "sclf_lr.fit(X_train,y_train.values)\n",
    "model_metrics(sclf_lr,X_train,X_test,y_train,y_test)\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "PyCharm (Nopimal)",
   "language": "python",
   "name": "pycharm-28004bf6"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 1
}